# Commands to start an interactive session on the JHPCE cluster
qrsh -l mem_free=20G,h_vmem=20G
module load conda_R
cd /fastscratch/myscratch/akuo/alsf-filbin
R
library(here)
dataset = "mrna" # options = "premrna_mrna", "mrna", or "premrna"
First, we create a table with information about where each file (quantified counts from salmon) is. This will be used to create a SummarizedExperiment object.
# list tumor names
tumor_names <- list.files(here("sample_data"))[
!grepl("*.txt", list.files(here("sample_data")))]
unique_sf_paths <- NULL
for(tum in tumor_names){
ids <- list.files(here("sample_data", tum))
ids <- unique(stringr::str_sub(ids, end=-13))
ids <- here(paste0("salmon_quants_", dataset), paste0(ids, "_quant"), "quant.sf")
unique_sf_paths <- c(unique_sf_paths, ids)
}
unique_sf_ids <- NULL
for(tum in tumor_names){
ids <- list.files(here("sample_data", tum))
ids <- unique(stringr::str_sub(ids, end=-13))
unique_sf_ids <- c(unique_sf_ids, ids)
}
coldata <- data.frame(files = unique_sf_paths, names = unique_sf_ids)
We also need to use the linkedTxome object to use tximeta properly, i.e. rowRanges(se) won’t be NULL and tximeta will be able to match the transcripts to the genes for us. Note: still doesn’t work
suppressPackageStartupMessages({
library(tximeta)
library(BiocFileCache)
})
# check if linkedTxome is already in the cache
bfcloc <- getTximetaBFC()
bfc <- BiocFileCache(bfcloc)
bfcinfo(bfc)
# if not, load linkedTxome json file
json_file <- here("salmon_files", "gencode.v32_salmon-index-v1.0.0.json")
loadLinkedTxome(json_file)
The only way I’ve figured out how to make this work for now is with skipMeta = TRUE.
se_file_name = here(paste0("salmon_quants_", dataset), paste0("se_", dataset, ".rds"))
# coldata = coldata[1:2, ] # for testing
if(!file.exists(se_file_name)){
# Create SummarizedExperiment object
if(dataset == "premrna")
se <- tximeta(coldata, skipMeta = TRUE) # Takes a few minutes, file size = 597 MB
else if (dataset == "mrna")
se <- tximeta(coldata) # run this line if you used mRNA transcripts in your index only, it will automatically detect the right transcriptome
# se <- tximeta(coldata, ignoreAfterBar = TRUE)
saveRDS(se, se_file_name)
} else {
se = readRDS(se_file_name) # Takes a couple of seconds
}
suppressPackageStartupMessages({
library(SummarizedExperiment)
library(DESeq2)
library(scater)
})
colData(se)
assayNames(se)
rowRanges(se)
dat = assay(se, "counts")
A SingleCellExperiment class is derived from the SummarizedExperiment class. The most important change is the addition of a new slot called reducedDims. Read more here.
# BiocManager::install('SingleCellExperiment')
library(SingleCellExperiment)
sce = SingleCellExperiment(assays = list(counts = dat))
# you can access counts by assay(sce, "counts") or counts(sce)
# you can add a new entry to assays slot by assay(sce, "counts_new") = dat_new
# This will take a long time (> 45 min); should probably separate out as a script
sce <- getBMFeatureAnnos(sce,
ids = rownames(sce),
filters = "ensembl_gene_id",
attributes = c("ensembl_gene_id", "hgnc_symbol",
"chromosome_name", "gene_biotype",
"start_position", "end_position"),
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl",
host = "www.ensembl.org")
rowData(sce)$gene_name_unique <-
uniquifyFeatureNames(rowData(sce)$gene_name_ensembl,
rowData(sce)$hgnc_symbol)
There are a couple of QC metrics to identify low-quality cells:
low_lib_size or (b) few expressed endogeneous genes (nonzero counts for those genes) low_n_featuresI will only do (1) for now.
library(scater)
# Compute quality control metrics:
# sum is the total count for each cell
# detected contains the number of detected genes (actually transcripts for our data)
df = perCellQCMetrics(sce)
df
# Find outliers with low library sizes (LibSize) and few detected features (n_features)
reasons = quickPerCellQC(df) # DataFrame of logical values
colSums(as.matrix(reasons))
# Discard outliers
filtered = sce[, !reasons$discard]
dat_filtered = counts(filtered) # 226608 x 518 for mrna, 221988 x 546 for premrn
sce_filtered = SingleCellExperiment(assays = list(counts = dat_filtered))
filtered_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_filtered_", dataset, ".rds"))
saveRDS(sce_filtered, filtered_file_name)
Diagnostic plots: https://osca.bioconductor.org/quality-control.html#quality-control-plots
library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)
## Warning: package 'SummarizedExperiment' was built under R version 3.6.2
## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## Warning: package 'DelayedArray' was built under R version 3.6.2
## Warning: package 'BiocParallel' was built under R version 3.6.2
dataset = "mrna" # options = "premrna_mrna", "mrna", or "premrna"
Run compute-mle.sh to get mle sig (“shape”) parameter first. There will be a parameter for every cell. It will take about 30 minutes to an hour.
# Load filtered counts
filtered_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_filtered_", dataset, ".rds"))
if(file.exists(filtered_file_name)){
sce_filtered = readRDS(filtered_file_name)
dat_filtered = counts(sce_filtered)
}
# Read in mle parameters
mle_file_names = list.files(here(paste0("mle_results_", dataset)), full.names = TRUE)
mle_results = lapply(mle_file_names, readRDS)
sig_vec = sapply(mle_results, function(r) r["sig"])
mu_vec = sapply(mle_results, function(r) r["mu"])
# Plot sig ("shape") distribution
ggplot(tibble(sig = sig_vec), aes(x = sig)) +
geom_histogram(bins = 30) +
labs(title = "Distribution of sig parameter",
x = "sig",
y = "Number of cells")
source(here("scripts", "quminorm.R"))
# Convert to pseudo-UMIs
umi_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_umi_", dataset, ".rds"))
if(file.exists(umi_file_name)){
dat_umi = readRDS(umi_file_name)
} else {
dat_umi = quminorm_matrix(dat_filtered, shape = median(sig_vec)) # Takes several minutes (20~30 min). In the paper, they use the mode, but I will use the median since it is not clear what mode means for continuous values. The median for mRNA is 3.16. The median for pre-mRNA is 3.28.
dat_umi = dat_umi[!(rowSums(dat_umi) == 0), ] # Remove transcripts with all zeros
sce_umi = SingleCellExperiment(assays = list(counts = dat_umi))
saveRDS(sce_umi, umi_file_name)
}
umi_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_umi_", dataset, ".rds"))
sce_umi = readRDS(umi_file_name)
dat_umi = counts(sce_umi)
# Counts by transcript
dim(dat_umi) # 176307 x 518 for mRNA, 173927 x 546 for premRNA
if(dataset == "mrna"){
se_mrna = readRDS(here(paste0("salmon_quants_", dataset), paste0("se_mrna.rds")))
transcripts_mrna = rownames(assay(se_mrna, "counts"))
row_ranges = rowRanges(se_mrna)
genes = unlist(elementMetadata(row_ranges)$gene_id)
mrna_gene_matching_dt = tibble(transcript = transcripts_mrna, genes) %>%
filter(transcript %in% rownames(dat_umi))
dat_gene = rowsum(dat_umi, group = mrna_gene_matching_dt$genes, reorder = FALSE)
dim(dat_gene) # 48890 x 518 for mRNA
} else if(dataset == "premrna") {
# Read in mrna data to look up genes
se_mrna = readRDS(here(paste0("salmon_quants_", dataset), paste0("se_mrna.rds")))
transcripts_mrna = rownames(assay(se_mrna, "counts"))
row_ranges = rowRanges(se_mrna)
genes = unlist(elementMetadata(row_ranges)$gene_id)
mrna_gene_matching_dt = tibble(transcript = transcripts_mrna, genes)
# Get genes for every premrna transcript
# Note: 208 premRNA transcripts did not have corresponding mRNA transcripts
transcripts_premrna = rownames(dat_umi)
transcripts_premrna_converted = gsub(".{8}$", "" , transcripts_premrna)
premrna_gene_matching_dt = left_join(tibble(transcript = transcripts_premrna_converted),
mrna_gene_matching_dt, by = "transcript")
dat_gene = rowsum(dat_umi, group = premrna_gene_matching_dt$genes, reorder = FALSE)
# Remove counts for unmatched transcripts
dat_gene = dat_gene[!is.na(rownames(dat_gene)), ]
dim(dat_gene) # 50476 x 546
}
gene_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_gene_", dataset, ".rds"))
sce_gene = SingleCellExperiment(assays = list(counts = dat_gene))
saveRDS(sce_gene, gene_file_name)
library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)
library(scattermore)
library(tictoc)
sce_prem = readRDS(here(paste0("salmon_quants_premrna"), "sce_gene_premrna.rds"))
sce_mrna = readRDS(here(paste0("salmon_quants_mrna"), "sce_gene_mrna.rds"))
dat_prem = counts(sce_prem)
dat_mrna = counts(sce_mrna)
dat_prem_rowmeans = tibble(genes = rownames(dat_prem),
mean_prem = rowMeans(dat_prem))
dat_mrna_rowmeans = tibble(genes = rownames(dat_mrna),
mean_mrna = rowMeans(dat_mrna))
dat_rowmeans = full_join(dat_prem_rowmeans, dat_mrna_rowmeans, by = "genes")
tic()
dat_rowmeans %>%
ggplot(aes(x = log(mean_mrna),
y = log(mean_prem))) +
geom_point(alpha = 0.5) +
geom_abline(intercept = 0, slope = 1, color = "red") +
labs(x = "Log(average gene expression with mRNA)",
y = "Log(average gene expression with pre-mRNA)") +
theme_bw()
## Warning: Removed 4924 rows containing missing values (geom_point).
toc() # 2.5 sec
## 6.623 sec elapsed
# Smoothscatter
tic()
smoothScatter(x = log(dat_rowmeans$mean_mrna),
y = log(dat_rowmeans$mean_prem))
toc() # 1 sec
## 0.497 sec elapsed
# Scattermoreplot version
tic()
d = dat_rowmeans %>%
mutate(log_mean_prem = log(mean_prem),
log_mean_mrna = log(mean_mrna)) %>%
select(log_mean_prem, log_mean_mrna)
d = d[complete.cases(d), ] %>% as.matrix()
scattermoreplot(d,
main='Scattermore version')
toc() # 0.5 sec
## 0.183 sec elapsed
# MA plot is a plot of log-intensity ratios (M) vs log-intensity averages (A)
dat_rowmeans = dat_rowmeans %>%
mutate(m = log(mean_prem) - log(mean_mrna),
a = (log(mean_prem) + log(mean_mrna)/2))
smoothScatter(x = dat_rowmeans$a, y = dat_rowmeans$m)
First, I make all the column sums the same by scaling. The count in row \(i\) and column \(j\) is transformed as \(x_{ij} = x_{ij}*\sum_i x_{ij}/median_j(\sum_i x_{ij})\).
summary(colSums(dat_prem))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2101 16590 35665 43588 61040 351921
# Apply transformation
n = round(median(colSums(dat_prem)))
dat_prem = apply(dat_prem, MARGIN = 2, function(x) x*n/sum(x))
summary(colSums(dat_prem))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35665 35665 35665 35665 35665 35665
I plot the mean and variance for every row (transcript). Under a poisson distribution, they should be the same.
plot_meanvar = function(dat_sub){
# estimate lambdas and variances for every transcript
means = rowMeans(dat_sub)
vars = apply(dat_sub, MARGIN = 1, var)
# variance = mean under Poisson distribution
tibble(means, vars) %>%
ggplot(aes(x = log(means), y = log(vars))) +
geom_point(alpha = 0.4) +
geom_abline(intercept = 0, slope = 1)
}
plot_meanvar(dat_prem) +
labs(title = "pre-mRNA")
I first compute the average expression for every row (x-axis) \(\hat{\mu}_i\) and the empirical \(P(X_i=0)\), which is the probability that for a given transcript \(i\), the count is 0.
I then compute what \(P(X_i=0)\) would be under the model assumptions of binomial or poisson, using parameters estimated from the data. In particular, for a \(Binom(n, p_i)\), \(n\) is the median number of total counts of cells and \(p_i\) is the mean proportion of counts that were in gene \(i\).
# Function to plot P(X_i = 0) against average expression level mu_i
plot_prob = function(dat_sub){
n = round(median(colSums(dat_sub)))
means = rowMeans(dat_sub)
vars = apply(dat_sub, MARGIN = 1, var)
# empirical P(X_i = 0)
emp_probs_0 = apply(dat_sub, MARGIN = 1, function(r) sum(r==0)/ncol(dat_sub))
plot_dt = tibble(means, emp_probs_0)
emp_props = rowMeans(dat_sub/colSums(dat_sub))
# Model P(X_i = 0) under Binomial
binom_probs_0 = dbinom(x = 0, size = n, prob = emp_props)
# Model P(X_i = 0) under Poisson
poiss_probs_0 = dpois(x = 0, lambda = n*emp_props)
# Model P(X_i = 0) under Negative Binomial
# Estimate size/dispersion parameter
model = lm(vars ~ 1*means + I(means^2) + 0, tibble(means, vars))
phi = 1/coef(model)["I(means^2)"]
nbinom_probs_0 = dnbinom(x = 0, size = phi, mu = n*emp_props)
# Tibble for plot
plot_lines_dt = tibble(means = means,
binomial = binom_probs_0,
poisson = poiss_probs_0,
nbinomial = nbinom_probs_0) %>%
pivot_longer(-means, names_to = "model", values_to = "probs_0")
# Plot
plt = plot_lines_dt %>%
ggplot(aes(x = log(means), y = probs_0)) +
geom_point(data = plot_dt, aes(x = log(means), y = emp_probs_0), alpha = 0.4) + # Add data points
geom_line(aes(color = model),
size = 1.5) + # Add lines for models
labs(x = "Average expression level log(E(X_i))",
y = "P(X_i = 0)") +
theme(text = element_text(size = 15))
# Return object
out = list(plot = plt,
lines_dt = plot_lines_dt)
return(out)
}
# Plot P(X_i = 0) against average expression level
prob_out_prem = plot_prob(dat_prem)
prob_out_prem$plot +
labs(title = "pre-mRNA")
summary(colSums(dat_mrna))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1781 19874 40504 49989 67076 439915
# Apply transformation
n = round(median(colSums(dat_mrna)))
dat_mrna = apply(dat_mrna, MARGIN = 2, function(x) x*n/sum(x))
summary(colSums(dat_mrna))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40504 40504 40504 40504 40504 40504
# Plot mean against variance
plot_meanvar(dat_mrna) +
labs(title = "mRNA")
# Plot P(X_i = 0) against average expression level
prob_out_mrna = plot_prob(dat_mrna)
prob_out_mrna$plot +
labs(title = "mRNA")
library(scran)
## Warning: package 'scran' was built under R version 3.6.2
library(scater)
library(BiocSingular)
## Warning: package 'BiocSingular' was built under R version 3.6.2
# library(factoextra)
library(tictoc)
library(glmpca)
source(here("scripts", "functions_genefilter.R"))
sce = sce_prem
dat = dat_prem
# Normalize and take log
clust = quickCluster(sce)
table(clust)
## clust
## 1 2 3 4
## 119 183 133 111
sce = computeSumFactors(sce, clusters = clust)
sce = logNormCounts(sce)
# Filter out highly variant genes
# filterHVG(sce)
X = assay(sce, "logcounts")
tic("approx PCA")
pc = runPCA(t(X), rank = 5, BSPARAM = IrlbaParam())
toc()
## approx PCA: 3.717 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(t(X), rank = 5, BSPARAM = RandomParam())
# toc()
# Run PCA with prcomp (~10 minutes)
# X = t(X)
# nonzero_var_cols = which(apply(X, 2, var) != 0) # Remove columns (transcripts) with zero variance
# X = X[, nonzero_var_cols]
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))
# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
ggplot(aes(PC1, PC2, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC1, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC2, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
ggplot(aes(PC4, PC5, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
sce = sce_mrna
dat = dat_mrna
# Normalize and take log
clust = quickCluster(sce)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
table(clust)
## clust
## 1 2 3
## 211 176 131
sce = computeSumFactors(sce, clusters = clust)
sce = logNormCounts(sce)
X = assay(sce, "logcounts")
tic("approx PCA")
pc = runPCA(t(X), rank = 5, BSPARAM = IrlbaParam())
toc()
## approx PCA: 2.79 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(t(X), rank = 5, BSPARAM = RandomParam())
# toc()
# Run PCA with prcomp (~10 minutes)
# X = t(X)
# nonzero_var_cols = which(apply(X, 2, var) != 0) # Remove columns (transcripts) with zero variance
# X = X[, nonzero_var_cols]
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))
# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
ggplot(aes(PC1, PC2, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC1, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC2, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
ggplot(aes(PC4, PC5, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
sce = sce_prem
dat = dat_prem
X = dat
# Takes a long time, may want to try on cluster
# Time increases with number of rows and number of latent dimensions ("L")
tic("glm PCA")
glmpc = glmpca(X, L = 3, fam = c("poi"))
toc()
tumor_labels = gsub("-.*$", "", colnames(sce))
# Plot PC
data.frame(PC1 = glmpc$factors[,1], PC2 = glmpc$factors[,2], color = tumor_labels) %>%
ggplot(aes(PC1, PC2, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC1 = glmpc$factors[,1], PC3 = glmpc$factors[,3], color = tumor_labels) %>%
ggplot(aes(PC1, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC2 = glmpc$factors[,2], PC3 = glmpc$factors[,3], color = tumor_labels) %>%
ggplot(aes(PC2, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()